In [1]:
import matplotlib.pyplot as plt
In [2]:
import json
import zipfile  
import pandas as pd
In [3]:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))
In [4]:
from pyspark.sql import SparkSession
In [5]:
spark = SparkSession.builder.master("local[4]") \
                    .appName('complaints_analysis') \
                    .config("spark.driver.memory", "15g") \
                    .getOrCreate()
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/12/06 21:34:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
In [6]:
data = None  
with zipfile.ZipFile("complaints.json.zip", "r") as z:
    for filename in z.namelist():  
        print(filename)
        with z.open(filename) as f:  
            data = f.read()
            complaints_df = spark.read.json(filename, multiLine=True) #json.loads(data.decode("utf-8"))

# complaints_df = pd.DataFrame(complaints)
# complaints_df.head()
complaints.json
                                                                                
In [7]:
print(f'The number of complaints in dataset is {complaints_df.count()}')
print(f'The number of features in dataset is {len(complaints_df.columns)}')
[Stage 1:>                                                          (0 + 1) / 1]
The number of complaints in dataset is 3127776
The number of features in dataset is 18
                                                                                
In [8]:
complaints_df.show()
[Stage 4:>                                                          (0 + 1) / 1]
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
|             company|company_public_response|    company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|date_received|date_sent_to_company|               issue|             product|state|           sub_issue|     sub_product|submitted_via|         tags|timely|zip_code|
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
|Experian Informat...|   Company has respo...|Closed with non-m...|     6227407|   The unverified ac...|         Consent provided|              N/A|   2022-11-22|          2022-11-22|Incorrect informa...|Credit reporting,...|   FL|Information belon...|Credit reporting|          Web|             |   Yes|   32839|
|       EQUIFAX, INC.|                       |         In progress|     6254497|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   CA|Investigation too...|Credit reporting|          Web|             |   Yes|   90045|
|Valley Collection...|   Company believes ...|Closed with expla...|     6250329|                       |                    Other|              N/A|   2022-11-29|          2022-11-29|Attempts to colle...|     Debt collection|   AZ|   Debt is not yours|    Medical debt|          Web|             |   Yes|   85234|
|       EQUIFAX, INC.|                       |         In progress|     6254320|                       |                         |              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   GA|Information belon...|Credit reporting|          Web|             |   Yes|   30097|
|      CITIBANK, N.A.|   Company has respo...|Closed with expla...|     6251994|                       |     Consent not provided|              N/A|   2022-11-28|          2022-11-28|Communication tac...|     Debt collection|   GA|Frequent or repea...|Credit card debt|          Web|             |   Yes|   30315|
|Viking Client Ser...|                       |Closed with expla...|     6249435|                       |                    Other|              N/A|   2022-11-25|          2022-11-25|Threatened to con...|     Debt collection|   FL|Talked to a third...|   I do not know|          Web|             |   Yes|   33025|
|Experian Informat...|   Company has respo...|Closed with expla...|     6240644|                       |     Consent not provided|              N/A|   2022-11-25|          2022-11-25|Improper use of y...|Credit reporting,...|   AL|Reporting company...|Credit reporting|          Web|             |   Yes|   35802|
|Experian Informat...|   Company has respo...|Closed with expla...|     6240781|                       |                    Other|              N/A|   2022-11-25|          2022-11-25|Incorrect informa...|Credit reporting,...|   MS|Information belon...|Credit reporting|          Web|             |   Yes|   39735|
|Experian Informat...|                       |         In progress|     6241534|                       |                         |              N/A|   2022-11-24|          2022-11-24|Improper use of y...|Credit reporting,...|   TX|Reporting company...|Credit reporting|          Web|             |   Yes|   75056|
|HYUNDAI CAPITAL A...|   Company has respo...|Closed with expla...|     6241749|                       |     Consent not provided|              N/A|   2022-11-24|          2022-11-24|Problems at the e...|Vehicle loan or l...|   FL|Unable to receive...|            Loan|          Web|Servicemember|   Yes|   33130|
|NCB Management Se...|                       |Closed with expla...|     6128313|                       |                         |              N/A|   2022-10-25|          2022-10-25|Attempts to colle...|     Debt collection|   LA|   Debt is not yours|   I do not know|          Web|             |   Yes|   70817|
|JPMORGAN CHASE & CO.|                       |Closed with expla...|     6206946|                       |                         |              N/A|   2022-11-15|          2022-11-15|Problem with a le...|Checking or savin...|   NY|Transaction was n...|Checking account|          Web|             |   Yes|   10027|
|       EQUIFAX, INC.|                       |         In progress|     6257631|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   OH|Their investigati...|Credit reporting|          Web|             |   Yes|   45251|
|       EQUIFAX, INC.|                       |         In progress|     6256727|                       |                    Other|              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   IL|Information belon...|Credit reporting|          Web|             |   Yes|   60429|
|FirstPoint Collec...|                       |Closed with expla...|     6205781|                       |                         |              N/A|   2022-11-16|          2022-11-16|Attempts to colle...|     Debt collection|   NC|   Debt is not yours|    Medical debt|          Web|             |    No|   27406|
|       EQUIFAX, INC.|                       |         In progress|     6254379|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   PA|Their investigati...|Credit reporting|          Web|             |   Yes|   19462|
|       EQUIFAX, INC.|                       |         In progress|     6254402|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   TX|Investigation too...|Credit reporting|          Web|             |   Yes|   75801|
|       EQUIFAX, INC.|                       |         In progress|     6254475|                       |                         |              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   FL|Information belon...|Credit reporting|          Web|             |   Yes|   32818|
|       EQUIFAX, INC.|                       |         In progress|     6254486|                       |                         |              N/A|   2022-11-29|          2022-11-29|Improper use of y...|Credit reporting,...|   MI|Reporting company...|Credit reporting|          Web|             |   Yes|   48125|
|HCFS Health Care ...|                       |         In progress|     6242110|                       |                    Other|              N/A|   2022-11-24|          2022-11-24|Attempts to colle...|     Debt collection|   OH|   Debt is not yours|    Medical debt|          Web|Servicemember|   Yes|   43123|
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
only showing top 20 rows

                                                                                
In [9]:
complaints_df.printSchema()
root
 |-- company: string (nullable = true)
 |-- company_public_response: string (nullable = true)
 |-- company_response: string (nullable = true)
 |-- complaint_id: string (nullable = true)
 |-- complaint_what_happened: string (nullable = true)
 |-- consumer_consent_provided: string (nullable = true)
 |-- consumer_disputed: string (nullable = true)
 |-- date_received: string (nullable = true)
 |-- date_sent_to_company: string (nullable = true)
 |-- issue: string (nullable = true)
 |-- product: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sub_issue: string (nullable = true)
 |-- sub_product: string (nullable = true)
 |-- submitted_via: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- timely: string (nullable = true)
 |-- zip_code: string (nullable = true)

Dataset has two date columns but they are in string format. I will convert them to datetime data type

In [10]:
from pyspark.sql.functions import col, to_date, min, max, countDistinct, count, desc, year
In [11]:
complaints_df = complaints_df.withColumn("date_received", to_date("date_received"))
complaints_df = complaints_df.withColumn("date_sent_to_company", to_date("date_sent_to_company"))
complaints_df.show()
[Stage 5:>                                                          (0 + 1) / 1]
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
|             company|company_public_response|    company_response|complaint_id|complaint_what_happened|consumer_consent_provided|consumer_disputed|date_received|date_sent_to_company|               issue|             product|state|           sub_issue|     sub_product|submitted_via|         tags|timely|zip_code|
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
|Experian Informat...|   Company has respo...|Closed with non-m...|     6227407|   The unverified ac...|         Consent provided|              N/A|   2022-11-22|          2022-11-22|Incorrect informa...|Credit reporting,...|   FL|Information belon...|Credit reporting|          Web|             |   Yes|   32839|
|       EQUIFAX, INC.|                       |         In progress|     6254497|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   CA|Investigation too...|Credit reporting|          Web|             |   Yes|   90045|
|Valley Collection...|   Company believes ...|Closed with expla...|     6250329|                       |                    Other|              N/A|   2022-11-29|          2022-11-29|Attempts to colle...|     Debt collection|   AZ|   Debt is not yours|    Medical debt|          Web|             |   Yes|   85234|
|       EQUIFAX, INC.|                       |         In progress|     6254320|                       |                         |              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   GA|Information belon...|Credit reporting|          Web|             |   Yes|   30097|
|      CITIBANK, N.A.|   Company has respo...|Closed with expla...|     6251994|                       |     Consent not provided|              N/A|   2022-11-28|          2022-11-28|Communication tac...|     Debt collection|   GA|Frequent or repea...|Credit card debt|          Web|             |   Yes|   30315|
|Viking Client Ser...|                       |Closed with expla...|     6249435|                       |                    Other|              N/A|   2022-11-25|          2022-11-25|Threatened to con...|     Debt collection|   FL|Talked to a third...|   I do not know|          Web|             |   Yes|   33025|
|Experian Informat...|   Company has respo...|Closed with expla...|     6240644|                       |     Consent not provided|              N/A|   2022-11-25|          2022-11-25|Improper use of y...|Credit reporting,...|   AL|Reporting company...|Credit reporting|          Web|             |   Yes|   35802|
|Experian Informat...|   Company has respo...|Closed with expla...|     6240781|                       |                    Other|              N/A|   2022-11-25|          2022-11-25|Incorrect informa...|Credit reporting,...|   MS|Information belon...|Credit reporting|          Web|             |   Yes|   39735|
|Experian Informat...|                       |         In progress|     6241534|                       |                         |              N/A|   2022-11-24|          2022-11-24|Improper use of y...|Credit reporting,...|   TX|Reporting company...|Credit reporting|          Web|             |   Yes|   75056|
|HYUNDAI CAPITAL A...|   Company has respo...|Closed with expla...|     6241749|                       |     Consent not provided|              N/A|   2022-11-24|          2022-11-24|Problems at the e...|Vehicle loan or l...|   FL|Unable to receive...|            Loan|          Web|Servicemember|   Yes|   33130|
|NCB Management Se...|                       |Closed with expla...|     6128313|                       |                         |              N/A|   2022-10-25|          2022-10-25|Attempts to colle...|     Debt collection|   LA|   Debt is not yours|   I do not know|          Web|             |   Yes|   70817|
|JPMORGAN CHASE & CO.|                       |Closed with expla...|     6206946|                       |                         |              N/A|   2022-11-15|          2022-11-15|Problem with a le...|Checking or savin...|   NY|Transaction was n...|Checking account|          Web|             |   Yes|   10027|
|       EQUIFAX, INC.|                       |         In progress|     6257631|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   OH|Their investigati...|Credit reporting|          Web|             |   Yes|   45251|
|       EQUIFAX, INC.|                       |         In progress|     6256727|                       |                    Other|              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   IL|Information belon...|Credit reporting|          Web|             |   Yes|   60429|
|FirstPoint Collec...|                       |Closed with expla...|     6205781|                       |                         |              N/A|   2022-11-16|          2022-11-16|Attempts to colle...|     Debt collection|   NC|   Debt is not yours|    Medical debt|          Web|             |    No|   27406|
|       EQUIFAX, INC.|                       |         In progress|     6254379|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   PA|Their investigati...|Credit reporting|          Web|             |   Yes|   19462|
|       EQUIFAX, INC.|                       |         In progress|     6254402|                       |                         |              N/A|   2022-11-29|          2022-11-29|Problem with a cr...|Credit reporting,...|   TX|Investigation too...|Credit reporting|          Web|             |   Yes|   75801|
|       EQUIFAX, INC.|                       |         In progress|     6254475|                       |                         |              N/A|   2022-11-29|          2022-11-29|Incorrect informa...|Credit reporting,...|   FL|Information belon...|Credit reporting|          Web|             |   Yes|   32818|
|       EQUIFAX, INC.|                       |         In progress|     6254486|                       |                         |              N/A|   2022-11-29|          2022-11-29|Improper use of y...|Credit reporting,...|   MI|Reporting company...|Credit reporting|          Web|             |   Yes|   48125|
|HCFS Health Care ...|                       |         In progress|     6242110|                       |                    Other|              N/A|   2022-11-24|          2022-11-24|Attempts to colle...|     Debt collection|   OH|   Debt is not yours|    Medical debt|          Web|Servicemember|   Yes|   43123|
+--------------------+-----------------------+--------------------+------------+-----------------------+-------------------------+-----------------+-------------+--------------------+--------------------+--------------------+-----+--------------------+----------------+-------------+-------------+------+--------+
only showing top 20 rows

                                                                                
In [12]:
complaints_df.printSchema()
root
 |-- company: string (nullable = true)
 |-- company_public_response: string (nullable = true)
 |-- company_response: string (nullable = true)
 |-- complaint_id: string (nullable = true)
 |-- complaint_what_happened: string (nullable = true)
 |-- consumer_consent_provided: string (nullable = true)
 |-- consumer_disputed: string (nullable = true)
 |-- date_received: date (nullable = true)
 |-- date_sent_to_company: date (nullable = true)
 |-- issue: string (nullable = true)
 |-- product: string (nullable = true)
 |-- state: string (nullable = true)
 |-- sub_issue: string (nullable = true)
 |-- sub_product: string (nullable = true)
 |-- submitted_via: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- timely: string (nullable = true)
 |-- zip_code: string (nullable = true)

In [13]:
min_date, max_date = complaints_df.select(min("date_received"), max("date_received")).first()
print(f'The range of the complaints received date is {min_date.strftime("%m/%d/%Y")} and {max_date.strftime("%m/%d/%Y")}')
[Stage 6:>                                                          (0 + 1) / 1]
The range of the complaints received date is 12/01/2011 and 12/05/2022
                                                                                
In [14]:
print(f'The total number of companies in the dataset is {complaints_df.select(countDistinct("company")).collect()[0][0]}')
[Stage 9:>                                                          (0 + 1) / 1]
The total number of companies in the dataset is 6581
                                                                                
In [15]:
ComplaintsCountPerCompany = complaints_df.groupBy("company")\
                                    .agg(count("*").alias("ComplaintsCount"))\
                                    .sort(desc("ComplaintsCount"))
ComplaintsCountPerCompany.show()
[Stage 15:>                                                         (0 + 1) / 1]
+--------------------+---------------+
|             company|ComplaintsCount|
+--------------------+---------------+
|       EQUIFAX, INC.|         538662|
|TRANSUNION INTERM...|         443688|
|Experian Informat...|         415222|
|BANK OF AMERICA, ...|         120194|
|WELLS FARGO & COM...|         101653|
|JPMORGAN CHASE & CO.|          96503|
|      CITIBANK, N.A.|          79735|
|CAPITAL ONE FINAN...|          74032|
| SYNCHRONY FINANCIAL|          40366|
|Navient Solutions...|          37508|
|Ocwen Financial C...|          33247|
|        U.S. BANCORP|          30257|
| NATIONSTAR MORTGAGE|          26881|
|AMERICAN EXPRESS ...|          25578|
|PORTFOLIO RECOVER...|          22299|
|       DISCOVER BANK|          21496|
|       PNC Bank N.A.|          21391|
|ENCORE CAPITAL GR...|          19989|
|TD BANK US HOLDIN...|          19627|
|Bread Financial H...|          19279|
+--------------------+---------------+
only showing top 20 rows

                                                                                
In [16]:
pl = ComplaintsCountPerCompany.toPandas().head(10).plot(kind="bar", 
                            x="company", y="ComplaintsCount", 
                            figsize=(10, 7), alpha=0.5, color="olive")
pl.set_xlabel("Company")
pl.set_ylabel("Number of complaints")
pl.set_title("Number of complaints by Country")
plt.show()
                                                                                
In [17]:
ComplaintsCountPerYear = complaints_df.groupBy(year("date_received").alias("year"))\
                                    .agg(count("*").alias("ComplaintsCount"))\
                                    .sort("year")
ComplaintsCountPerYear.show()
[Stage 26:>                                                         (0 + 1) / 1]
+----+---------------+
|year|ComplaintsCount|
+----+---------------+
|2011|           2536|
|2012|          72373|
|2013|         108217|
|2014|         153034|
|2015|         168466|
|2016|         191457|
|2017|         242931|
|2018|         257252|
|2019|         277324|
|2020|         444329|
|2021|         496019|
|2022|         713838|
+----+---------------+

                                                                                
In [18]:
pl = ComplaintsCountPerYear.toPandas().head(10).plot(kind="bar", 
                            x="year", y="ComplaintsCount", 
                            figsize=(10, 7), alpha=0.5, color="olive")
pl.set_xlabel("Company")
pl.set_ylabel("Number of complaints")
pl.set_title("Number of complaints by Country")
plt.show()
                                                                                
In [19]:
ComplaintsCountPerProduct = complaints_df.groupBy("product")\
                                    .agg(count("*").alias("ComplaintsCount"))\
                                    .sort(desc("ComplaintsCount"))
ComplaintsCountPerProduct.show()
[Stage 37:>                                                         (0 + 1) / 1]
+--------------------+---------------+
|             product|ComplaintsCount|
+--------------------+---------------+
|Credit reporting,...|        1452854|
|     Debt collection|         454053|
|            Mortgage|         365651|
|Credit card or pr...|         166567|
|Checking or savin...|         142819|
|    Credit reporting|         140429|
|         Credit card|          89190|
|Bank account or s...|          86206|
|        Student loan|          71721|
|Money transfer, v...|          48122|
|Vehicle loan or l...|          37409|
|       Consumer Loan|          31596|
|Payday loan, titl...|          25367|
|         Payday loan|           5543|
|     Money transfers|           5354|
|        Prepaid card|           3819|
|Other financial s...|           1058|
|    Virtual currency|             18|
+--------------------+---------------+

                                                                                
In [20]:
pl = ComplaintsCountPerProduct.toPandas().plot(kind="bar", 
                            x="product", y="ComplaintsCount", 
                            figsize=(10, 7), log=True, alpha=0.5, color="olive")
pl.set_xlabel("Product")
pl.set_ylabel("Number of complaints (Log scale)")
pl.set_title("Number of complaints by product")
plt.show()
                                                                                
In [21]:
print(f'The total number of products in the dataset is {complaints_df.select(countDistinct("product")).collect()[0][0]}')
[Stage 48:>                                                         (0 + 1) / 1]
The total number of products in the dataset is 18
                                                                                
In [84]:
complaints_df.select(countDistinct("company_public_response")).collect()[0][0]
                                                                                
Out[84]:
12
In [89]:
company_public_response_count = complaints_df.groupBy("company_public_response")\
                                    .agg(count("*").alias("ComplaintsCount"))\
                                    .sort(desc("ComplaintsCount"))
company_public_response_count.show(truncate=False)
[Stage 176:>                                                        (0 + 1) / 1]
+-----------------------------------------------------------------------------------------------------------------------+---------------+
|company_public_response                                                                                                |ComplaintsCount|
+-----------------------------------------------------------------------------------------------------------------------+---------------+
|                                                                                                                       |1766304        |
|Company has responded to the consumer and the CFPB and chooses not to provide a public response                        |1153102        |
|Company believes it acted appropriately as authorized by contract or law                                               |112514         |
|Company chooses not to provide a public response                                                                       |52473          |
|Company believes the complaint is the result of a misunderstanding                                                     |11023          |
|Company disputes the facts presented in the complaint                                                                  |8979           |
|Company believes complaint caused principally by actions of third party outside the control or direction of the company|6973           |
|Company believes complaint is the result of an isolated error                                                          |5650           |
|Company believes complaint represents an opportunity for improvement to better serve consumers                         |4242           |
|Company can't verify or dispute the facts in the complaint                                                             |3860           |
|Company believes the complaint provided an opportunity to answer consumer's questions                                  |2557           |
|Company believes complaint relates to a discontinued policy or procedure                                               |99             |
+-----------------------------------------------------------------------------------------------------------------------+---------------+

                                                                                
In [93]:
company_response_count = complaints_df.groupBy("company_response")\
                                    .agg(count("*").alias("count"))\
                                    .sort(desc("count"))

company_response_count.show(truncate=False)
[Stage 185:>                                                        (0 + 1) / 1]
+-------------------------------+-------+
|company_response               |count  |
+-------------------------------+-------+
|Closed with explanation        |2417132|
|Closed with non-monetary relief|466957 |
|Closed with monetary relief    |123524 |
|In progress                    |70021  |
|Closed without relief          |17868  |
|Closed                         |17611  |
|Untimely response              |9355   |
|Closed with relief             |5304   |
|                               |4      |
+-------------------------------+-------+

                                                                                
In [101]:
complaints_df.select("submitted_via").distinct().show()
[Stage 209:>                                                        (0 + 1) / 1]
+-------------+
|submitted_via|
+-------------+
|        Phone|
|          Fax|
|        Email|
| Web Referral|
|     Referral|
|  Postal mail|
|          Web|
+-------------+

                                                                                
In [102]:
complaints_df.select('issue').distinct().show()
[Stage 212:>                                                        (0 + 1) / 1]
+--------------------+
|               issue|
+--------------------+
|Communication tac...|
|Application proce...|
|Unexpected or oth...|
|Advertising and m...|
|Problem with an o...|
|Getting a loan or...|
|Balance transfer fee|
|Customer service/...|
|        Adding money|
|Credit card prote...|
|Closing/Cancellin...|
|Received a loan I...|
|Vehicle was repos...|
|Can't stop charge...|
|                Fees|
|          Bankruptcy|
|Forbearance / Wor...|
|Credit determination|
|Confusing or miss...|
|Loan modification...|
+--------------------+
only showing top 20 rows

                                                                                
In [103]:
complaints_df.select('state').distinct().show()
[Stage 215:>                                                        (0 + 1) / 1]
+--------------------+
|               state|
+--------------------+
|UNITED STATES MIN...|
|                  AZ|
|                  SC|
|                  LA|
|                  MN|
|                  AA|
|                  NJ|
|                  DC|
|                  OR|
|                  VA|
|                  RI|
|                  KY|
|                  WY|
|                  NH|
|                  MI|
|                  NV|
|                  WI|
|                  ID|
|                  CA|
|                  CT|
+--------------------+
only showing top 20 rows

                                                                                
In [105]:
print(f'The total number of state in the dataset is {complaints_df.select(countDistinct("state")).collect()[0][0]}')
[Stage 224:>                                                        (0 + 1) / 1]
The total number of state in the dataset is 64
                                                                                
In [22]:
statewise_complaints_count = complaints_df.groupBy("state")\
                                    .agg(count("*").alias("count"))\
                                    .sort(desc("count"))

statewise_complaints_count.show(truncate=False)
[Stage 54:>                                                         (0 + 1) / 1]
+-----+------+
|state|count |
+-----+------+
|CA   |381389|
|FL   |350313|
|TX   |305289|
|NY   |204477|
|GA   |197188|
|PA   |141286|
|IL   |119176|
|NJ   |107395|
|NC   |98699 |
|OH   |84040 |
|MD   |83916 |
|VA   |79859 |
|MI   |69414 |
|TN   |61064 |
|AL   |57789 |
|AZ   |57321 |
|SC   |54923 |
|LA   |49813 |
|MA   |49303 |
|WA   |44082 |
+-----+------+
only showing top 20 rows

                                                                                
In [23]:
import plotly.express as px
fig = px.choropleth(statewise_complaints_count.toPandas(),
                    locations='state', 
                    locationmode="USA-states", 
                    scope="usa",
                    color='count',
                    color_continuous_scale="Viridis_r", 
                   )

fig.update_layout(
      title_text = 'Number of complaints by State (2011-2020)',
      title_font_family="Times New Roman",
      title_font_size = 22,
      title_font_color="black", 
      title_x=0.45, 
         )


fig.show()
                                                                                
In [24]:
statewise_complaints_per_day_count = complaints_df.groupBy(["state", year("date_received").alias('year')])\
                                    .agg(count("*").alias("count"))\
                                    .sort("year")

statewise_complaints_per_day_count.show(truncate=False)
[Stage 65:>                                                         (0 + 1) / 1]
+-----+----+-----+
|state|year|count|
+-----+----+-----+
|SC   |2011|21   |
|LA   |2011|12   |
|NE   |2011|4    |
|IA   |2011|13   |
|GA   |2011|99   |
|DC   |2011|11   |
|WA   |2011|64   |
|WI   |2011|24   |
|KS   |2011|15   |
|ND   |2011|1    |
|HI   |2011|13   |
|DE   |2011|15   |
|AK   |2011|5    |
|AR   |2011|9    |
|MA   |2011|71   |
|WV   |2011|7    |
|RI   |2011|10   |
|SD   |2011|10   |
|OH   |2011|69   |
|TN   |2011|37   |
+-----+----+-----+
only showing top 20 rows

                                                                                
In [25]:
df = statewise_complaints_per_day_count.toPandas()
df.head()
                                                                                
Out[25]:
state year count
0 MT 2011 7
1 IA 2011 13
2 TN 2011 37
3 DC 2011 11
4 NV 2011 18
In [26]:
df['year'] = df['year'].astype(str)
df.head()
Out[26]:
state year count
0 MT 2011 7
1 IA 2011 13
2 TN 2011 37
3 DC 2011 11
4 NV 2011 18
In [33]:
fig = px.choropleth(df,
                    locations='state', 
                    locationmode="USA-states", 
                    color='count',
                    color_continuous_scale="Viridis_r", 
                    scope="usa",
                    animation_frame='year') 


fig.update_layout(
      title_text = 'Number of complaints by State (2011-2020)',
      title_font_family="Times New Roman",
      title_font_size = 22,
      title_font_color="black", 
      title_x=0.45, 
    )


fig.show()
In [34]:
spark.stop()